-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[mlir][xegpu] Add definitions of MatrixDescType and related ops. #153273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Chao Chen (chencha3) Changes: This PR adds the definition of MatrixDesc, a type representing a block of data stored in shared local memory, and related load/store operations for Intel Xe GPUs. Full diff: https://github.com/llvm/llvm-project/pull/153273.diff 7 Files Affected:
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 1a6a34c8d775a..f536650e9d872 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1101,4 +1101,110 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
let hasCanonicalizer = 1;
}
+def isSharedPred : CPred<"isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
+class StaticShared1DMemRefOf<list<Type> allowedTypes> :
+ ConfinedType<MemRefRankOf<allowedTypes, [1]>, [HasStaticShapePred, isSharedPred],
+ "statically shaped " # MemRefOf<allowedTypes>.summary # " for shared memory",
+ "mlir::MemRefType">;
+
+class SizeInBits<string name> :
+ StrFunc<"llvm::cast<mlir::ShapedType>($" # name # ".getType()).getNumElements()"
+ "*llvm::cast<mlir::ShapedType>($" # name # ".getType()).getElementTypeBitWidth()">;
+class AllMemSizesMatch<list<string> names> :
+ AllMatchSameOperatorTrait<names, SizeInBits<"_self">.result,
+ "size in bits">;
+
+def XeGPU_CreateMatrixDescOp: XeGPU_Op<"create_matrix_desc", [Pure,
+ AllMemSizesMatch<["source", "matrix_desc"]>]> {
+ let summary = "Create a matrix descriptor.";
+ let description = [{
+ Creates a matrix descriptor from a shared local memory (SLM) buffer.
+ The resulting matrix descriptor has to have the same size as the underlying
+ shared local memory.
+
+ Arguments:
+ - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer.
+ Results:
+ - `matrix_desc` : the matrix descriptor.
+ }];
+ let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source);
+ let results = (outs XeGPU_MatrixDesc:$matrix_desc);
+ let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($matrix_desc))";
+}
+
+def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
+ AllElementTypesMatch<["matrix_desc", "res"]>,
+ AllRanksMatch<["matrix_desc", "res"]>]> {
+ let arguments = (ins XeGPU_MatrixDesc:$matrix_desc,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets,
+ OptionalAttr<LayoutTrait>:$layout
+ );
+ let results = (outs XeGPU_ValueType:$res);
+ let assemblyFormat = [{
+ $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
+ prop-dict attr-dict `` `:` type(operands) `->` type(results)
+ }];
+
+ let description = [{
+ This operation reads a block of data from shared local memory (SLM)
+ using the provided matrix descriptor.
+
+ Arguments:
+ - `matrix_desc`: the matrix descriptor identifying the SLM region.
+ - `offsets`: the coordinates within the matrix to read from.
+ Results:
+ - `res`: the matrix elements loaded from SLM.
+ }];
+
+ let builders = [
+ OpBuilder<(ins "Type":$res, "TypedValue<MatrixDescType>": $matrix_desc,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "LayoutTrait": $layout)>,
+ ];
+ let extraClassDeclaration = [{
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
+ AllElementTypesMatch<["matrix_desc", "data"]>,
+ AllRanksMatch<["matrix_desc", "data"]>]> {
+ let arguments = (ins
+ XeGPU_MatrixDesc:$matrix_desc,
+ Variadic<Index>: $offsets,
+ DenseI64ArrayAttr: $const_offsets,
+ XeGPU_ValueType:$data,
+ OptionalAttr<LayoutTrait>:$layout
+ );
+ let assemblyFormat = [{
+ $matrix_desc `` custom<DynamicIndexList>($offsets, $const_offsets) `,` $data
+ prop-dict attr-dict `:` type(operands)
+ }];
+ let description = [{
+ This operation writes the `data` fragment into the shared local memory region
+ identified by `matrix_desc`.
+
+ Arguments:
+ - `matrix_desc`: the matrix descriptor specifying the SLM region.
+ - `offsets`: the coordinates within the matrix where the data will be written.
+ - `data`: the values to be stored in the matrix.
+ }];
+ let builders = [
+ OpBuilder<(ins "TypedValue<MatrixDescType>": $matrix_desc, "llvm::ArrayRef<OpFoldResult>": $offsets,
+ "Value" : $data, "LayoutTrait": $layout)>,
+ ];
+ let extraClassDeclaration = [{
+ SmallVector<OpFoldResult> getMixedOffsets() {
+ return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b268cabb5d266..02cabce82398b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -201,4 +201,26 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
}];
}
+def XeGPU_MatrixDesc: XeGPUTypeDef<"MatrixDesc", "matrix_desc", [ShapedTypeInterface], "mlir::Type"> {
+ let summary = "MatrixDesc describing the data in SLM";
+ let description = [{
+ MatrixDesc represents a block of data stored in shared local memory.
+ By default, unless a layout attribute is provided, the data is stored
+ contiguously in row-major order within the region.
+ }];
+ let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+ "mlir::Type": $elementType,
+ OptionalParameter<"mlir::Attribute">: $layout);
+
+ let extraClassDeclaration = [{
+ bool hasRank() const { return true; }
+
+ MatrixDescType cloneWith(std::optional<llvm::ArrayRef<int64_t>> shape, Type elementType) const {
+ return MatrixDescType::get(getContext(), shape.value_or(getShape()), elementType, getLayout());
+ }
+ }];
+
+ let hasCustomAssemblyFormat = true;
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 7c6a4f37db9af..603fb5d237544 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
MLIRAffineUtils
MLIRArithUtils
MLIRDialectUtils
+ MLIRGPUDialect
MLIRIR
MLIRViewLikeInterface
MLIRVectorDialect
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index d997296a22c20..ac9e994d4872c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -591,6 +591,62 @@ LogicalResult TensorDescType::verify(
return success();
}
+//===----------------------------------------------------------------------===//
+// XeGPU_MatrixDescType
+//===----------------------------------------------------------------------===//
+mlir::Type MatrixDescType::parse(::mlir::AsmParser &parser) {
+ llvm::SmallVector<int64_t> shape;
+ mlir::Type elementType;
+ mlir::FailureOr<mlir::Attribute> layout;
+
+ // Parse literal '<'
+ if (parser.parseLess())
+ return {};
+
+ auto shapeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseDimensionList(shape, false, true))) {
+ parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+ return {};
+ }
+
+ auto elemTypeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseType(elementType))) {
+ parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+ return {};
+ }
+
+ // parse optional attributes
+ if (mlir::succeeded(parser.parseOptionalComma())) {
+ mlir::Attribute attr;
+ ParseResult res = parser.parseAttribute(attr);
+ if (mlir::failed(res))
+ return {};
+ layout = attr;
+ }
+
+ // Parse literal '>'
+ if (parser.parseGreater())
+ return {};
+
+ MLIRContext *ctxt = parser.getContext();
+ return MatrixDescType::getChecked(
+ [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape,
+ elementType, layout.value_or(mlir::Attribute()));
+}
+
+void MatrixDescType::print(::mlir::AsmPrinter &printer) const {
+ printer << "<";
+
+ printer.printDimensionList(getShape());
+ printer << 'x';
+ printer << getElementType();
+
+ if (auto layout = getLayout())
+ printer << ", " << layout;
+
+ printer << ">";
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2cd086feb5deb..2051d7030340e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -21,6 +22,15 @@
namespace mlir {
namespace xegpu {
+bool isSharedMemory(const MemRefType &memrefTy) {
+ Attribute attr = memrefTy.getMemorySpace();
+ if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr))
+ return intAttr.getInt() == 3;
+ if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
+ return memrefSpace.getValue() == MemorySpace::SLM;
+ return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
+}
+
template <typename T>
static std::string makeString(T array, bool breakline = false) {
std::string buf;
@@ -925,6 +935,59 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
patterns.add<FoldConvertLayoutOp>(context);
}
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadMatrixOp
+//===----------------------------------------------------------------------===//
+void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res,
+ TypedValue<MatrixDescType> matrixDesc,
+ llvm::ArrayRef<OpFoldResult> offsets,
+ LayoutTrait layout) {
+ llvm::SmallVector<Value> dynamicOffsets;
+ llvm::SmallVector<int64_t> staticOffsets;
+
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+ auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+ build(builder, state, res, matrixDesc, dynamicOffsets, staticOffsetsAttr,
+ layout);
+}
+
+LogicalResult LoadMatrixOp::verify() {
+ ArrayRef<int64_t> valueShape = getRes().getType().getShape();
+ ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+ if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape),
+ [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+ return emitOpError("result shape must not exceed matrix desc shape.");
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreMatrixOp
+//===----------------------------------------------------------------------===//
+void StoreMatrixOp::build(OpBuilder &builder, OperationState &state,
+ TypedValue<MatrixDescType> matrixDesc,
+ llvm::ArrayRef<OpFoldResult> offsets, Value data,
+ LayoutTrait layout) {
+ llvm::SmallVector<Value> dynamicOffsets;
+ llvm::SmallVector<int64_t> staticOffsets;
+
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+ auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+
+ build(builder, state, matrixDesc, dynamicOffsets, staticOffsetsAttr, data,
+ layout);
+}
+
+LogicalResult StoreMatrixOp::verify() {
+ ArrayRef<int64_t> dataShape = getData().getType().getShape();
+ ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape();
+ if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape),
+ [](auto p) { return std::get<0>(p) > std::get<1>(p); }))
+ return emitOpError("data shape must not exceed matrix desc shape.");
+
+ return success();
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 44e15dd7cbb38..2feb010d343a8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -762,3 +762,47 @@ func.func @slice_attr_repeat_dim() {
return
}
+// -----
+func.func @create_matrix_desc_non_slm() {
+ %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1>
+ // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}}
+ %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 1> -> !xegpu.matrix_desc<16x64xf16>
+ return
+}
+
+// -----
+func.func @create_matrix_desc_mismatch_sizes() {
+ %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+ // expected-error@+1 {{failed to verify that all of {source, matrix_desc} have same size in bits}}
+ %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x32xf16>
+ return
+}
+
+// -----
+func.func @load_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+ // expected-error@+1 {{failed to verify that all of {matrix_desc, res} have same element type}}
+ %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf32>
+ return
+}
+
+// -----
+func.func @load_matrix_desc_invalid_result_size(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+ // expected-error@+1 {{result shape must not exceed matrix desc shape}}
+ %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<32x16xf16>
+ return
+}
+
+// -----
+func.func @store_matrix_desc_mismatch_element_type(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf32>) {
+ // expected-error@+1 {{failed to verify that all of {matrix_desc, data} have same element type}}
+ xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf32>
+ return
+}
+
+// -----
+func.func @store_matrix_desc_invalid_data_size(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<32x32xf16>) {
+ // expected-error@+1 {{data shape must not exceed matrix desc shape}}
+ xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<32x32xf16>
+ return
+}
+
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 67c00f5a9cc2f..cda8f0ac1bb40 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -751,4 +751,51 @@ gpu.func @fence() {
gpu.return
}
+// CHECK-LABEL: gpu.func @create_matrix_desc({{.*}}) {
+gpu.func @create_matrix_desc() {
+ //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+ //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+ %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+ %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_matrix_desc_with_stride({{.*}}) {
+gpu.func @create_matrix_desc_with_stride() {
+ //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3>
+ //CHECK: [[mdesc:%.+]] = xegpu.create_matrix_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+ %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3>
+ %matrix_desc = xegpu.create_matrix_desc %m : memref<2048xi8, 3> -> !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @load_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>)
+gpu.func @load_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>) {
+ // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+ %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> vector<8x16xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>)
+gpu.func @load_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>) {
+ // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+ %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> -> vector<8x16xf16>
+ gpu.return
+}
+
+
+// CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) {
+ // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+ xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>)
+gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) {
+ // CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+ xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16>
+ gpu.return
+}
+
}
|
mlir/test/Dialect/XeGPU/ops.mlir
Outdated
@@ -751,4 +751,51 @@ gpu.func @fence() { | |||
gpu.return | |||
} | |||
|
|||
// CHECK-LABEL: gpu.func @create_matrix_desc({{.*}}) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a test with layout attribute?
Without layout attribute, matrix_desc is not different from a memref created by memref.view
It appears to me, what sets matrix_desc apart from memref is the optional layout attribute.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, the distribution layout has been moved to the load/store/subview ops. The data layout is kept, and we have a test case for it. Yeah I agree with you that matrix_desc is similar to memref now on Xe2, except matrix_desc is binding to slm only.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also cc @akroviakov |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description lacks some details regarding how these ops play into the distribution passes, especially since these ops seem to prioritize usage above WI-level, judging by numerous tests with 2D payload.
Will the distribution perform a plain offset and payload shape manipulation?
What about the lowering? Does XeVM or SPIRV beneath it offer an efficient way to execute this op? If so, would the lowering be able to derive all of the needed built-in/intrinsic arguments without any named attributes?
bool isSharedMemory(const MemRefType &memrefTy) { | ||
Attribute attr = memrefTy.getMemorySpace(); | ||
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr)) | ||
return intAttr.getInt() == 3; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
static_cast<int>(xevm::AddrSpace::SHARED)
seems more appropriate.
https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td#L329
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added support for this.
LogicalResult LoadMatrixOp::verify() { | ||
ArrayRef<int64_t> valueShape = getRes().getType().getShape(); | ||
ArrayRef<int64_t> mdescShape = getMatrixDesc().getType().getShape(); | ||
if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does AllShapesMatch
in .td
definition not suit this purpose?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per latest definition, it can load or store a smaller shape of data from a bigger MatrixDesc. So AllShapesMatch doesn't fit here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, sorry, missed the >
in lambda.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no worries
The MatrixDescType doesn't have distribution layout. Instead, each op (load_matrix/store_matrix) has an optional layout attribute for this purpose. The doc is updated to reveal this. |
let summary = "MatrixDesc describing the data in SLM"; | ||
let description = [{ | ||
MatrixDesc represents a block of data stored in shared local memory. | ||
By default, unless a layout attribute is provided, the data is stored |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this layout?
I assume this is not distribution layout as you said it is not part of matrix desc type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the tests, there is usage like
!xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>
Is strided<[1,16]>
the layout attribute?
Is that attribute type not some that can be represented by
MemRefLayoutAttrInterface
used in MemRefType ?
https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/IR/BuiltinTypes.td#L796-#L801
Looks very similar to MemRefType. For example,
memref<12x4xf32, strided<[4, 1], offset: 5>>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please refer to intel/mlir-extensions#1092 for the motivation and explanation of the slm memory layout of matrix descriptor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 to the questions here. It just reads as a MemRef layout which is fine but could use explicit clarification as layout
term becomes quite overloaded within the dialect now.
Could you add more description here? Or at least an example snippet.
mlir/test/Dialect/XeGPU/ops.mlir
Outdated
// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>) | ||
gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) { | ||
// CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16> | ||
xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, vector<16x16xf16> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This store out-of-bound place?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe keep vector<16x16xf16> as vector<8x16xf16>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good catch, fixed.
mlir/test/Dialect/XeGPU/ops.mlir
Outdated
// CHECK: gpu.func @store_matrix_desc([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>) | ||
gpu.func @store_matrix_desc(%arg0: !xegpu.matrix_desc<16x64xf16>, %arg1: vector<16x16xf16>) { | ||
// CHECK: xegpu.store_matrix [[ARG0]][8, 8], [[ARG1]] : !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16> | ||
xegpu.store_matrix %arg0[8, 8], %arg1: !xegpu.matrix_desc<16x64xf16>, vector<16x16xf16> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we want to use the format below to be consistent with xegpu.store
xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
mlir/test/Dialect/XeGPU/ops.mlir
Outdated
// CHECK: gpu.func @matrix_desc_subview([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16>) | ||
gpu.func @matrix_desc_subview(%arg0: !xegpu.matrix_desc<16x64xf16>) { | ||
//CHECK: xegpu.matrix_desc_subview [[ARG0]][8, 8] : !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16> | ||
%data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf16> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
!xegpu.matrix_desc<8x16xf16> needs to have strides=[64, 1]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
mlir/test/Dialect/XeGPU/invalid.mlir
Outdated
// ----- | ||
func.func @matrix_desc_subview_element_type_mismatch(%arg0: !xegpu.matrix_desc<16x64xf16>) { | ||
// expected-error@+1 {{failed to verify that all of {src, res} have same element type}} | ||
%data = xegpu.matrix_desc_subview %arg0[8, 8]: !xegpu.matrix_desc<16x64xf16> -> !xegpu.matrix_desc<8x16xf32> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this one also have an additional error, the strides need to be carried in the subview.
xegpu.matrix_desc<8x16xf32> => xegpu.matrix_desc<8x16xf32, strides=[64, 1]>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
}]; | ||
let parameters = (ins ArrayRefParameter<"int64_t">: $shape, | ||
"mlir::Type": $elementType, | ||
OptionalParameter<"mlir::Attribute">: $layout); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
consider using mem_layout instead of layout, to differentiate with XeGPU.layout which describes the mapping between sg/lane ids to the data.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
mlir/test/Dialect/XeGPU/ops.mlir
Outdated
// CHECK: gpu.func @store_matrix_desc_with_stride([[ARG0:%.+]]: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, [[ARG1:%.+]]: vector<16x16xf16>) | ||
gpu.func @store_matrix_desc_with_stride(%arg0: !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>>, %arg1: vector<16x16xf16>) { | ||
// CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> | ||
xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.matrix_desc<16x64xf16, strided<[1, 16]>> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the format, would this alternative looks better ?
!xegpu.matrix_desc<16x64xf16, strides=[1, 16], blocksize=[16, 16]>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This format needs two attribute fields. It is a little bit hard to do the extension in downstream. I create a MemLayoutAttr
to encode them, with format as !xegpu.matrix_desc<8x16xf16, #xegpu.mem_layout<stride = [64, 1]>, block = [8, 8]>
, is it good to you?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
overall looks good. leave some minor comments
A high-level question regarding naming: why a "matrix" descriptor? AFAIK, it's specifically limited to SLM only as a storage and supports nD layouts. |
let summary = "MatrixDesc describing the data in SLM"; | ||
let description = [{ | ||
MatrixDesc represents a block of data stored in shared local memory. | ||
By default, unless a layout attribute is provided, the data is stored |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 to the questions here. It just reads as a MemRef layout which is fine but could use explicit clarification as layout
term becomes quite overloaded within the dialect now.
Could you add more description here? Or at least an example snippet.
[Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> { | ||
let description = [{ | ||
Creates a subview of a matrix descriptor. The resulting matrix descriptor | ||
may have a lower rank than the source, in which case the dimensions are left-aligned. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does left-aligned
mean?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
emm, a bad description here. I tweaked it a little bit. It means for lower rank result, its dims are mapped the high-order dims of the source. e.g., src: matrix_desc<8x64xf32>, res: matrix<16xf32> (offset [8, 8]), result is taken from the 2nd dimension of the src. it is similar to vector.extract op.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added some examples for MatrixDesc in the description.
|
||
let parameters = (ins "DictionaryAttr": $attrs); | ||
let hasCustomAssemblyFormat = 1; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove extra line
identified by `mem_desc`. | ||
|
||
Arguments: | ||
- `mem_desc`: the matrix descriptor specifying the SLM region. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: matrix -> memory
using the provided matrix descriptor. | ||
|
||
Arguments: | ||
- `mem_desc`: the matrix descriptor identifying the SLM region. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: matrix -> memory
let description = [{ | ||
Creates a matrix descriptor from a shared local memory (SLM) buffer. | ||
The resulting matrix descriptor has to have the same size as the underlying | ||
shared local memory. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shared local memory => memory. The memory descriptor itself doesn't have to be associated with shared local memory.
def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "mlir::Type"> { | ||
let summary = "MemDesc describing the data in SLM"; | ||
let description = [{ | ||
MemDesc represents a block of data stored in shared local memory. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
consider "a block of data" => "multi-dimensional array buffer". MemDesc associates "structure" information to the buffer (block of data) so it can be viewed as multi-dimension array buffer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This PR adds the definition of
MatrixDesc
, a type that represents a block of data stored in shared local memory, and related load/store operations for Intel Xe GPUs.